from langchain_community.document_loaders import S3DirectoryLoader
import logging
import boto3
import os
from urllib.parse import urlparse

def get_s3_files_info(s3_url,aws_access_key_id=None,aws_secret_access_key=None):
  try:
      # Extract bucket name and directory from the S3 URL
      parsed_url = urlparse(s3_url)
      bucket_name = parsed_url.netloc
      directory = parsed_url.path.lstrip('/')
      try:
        # Connect to S3
        s3 = boto3.client('s3',aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)

        # List objects in the specified directory
        response = s3.list_objects_v2(Bucket=bucket_name, Prefix=directory)
      except Exception as e:
         raise Exception("Invalid AWS credentials")
      
      files_info = []

      # Check each object for file size and type
      for obj in response.get('Contents', []):
          file_key = obj['Key']
          file_name = os.path.basename(file_key)
          logging.info(f'file_name : {file_name}  and file key : {file_key}')
          file_size = obj['Size']

          # Check if file is a PDF
          if file_name.endswith('.pdf'):
            files_info.append({'file_key': file_key, 'file_size_bytes': file_size})
            
      return files_info
  except Exception as e:
    error_message = str(e)
    logging.error(f"Error while reading files from s3: {error_message}")
    raise Exception(error_message)


def get_s3_pdf_content(s3_url,aws_access_key_id=None,aws_secret_access_key=None):
    try:
      # Extract bucket name and directory from the S3 URL
        parsed_url = urlparse(s3_url)
        bucket_name = parsed_url.netloc
        logging.info(f'bucket name : {bucket_name}')
        directory = parsed_url.path.lstrip('/')
        if directory.endswith('.pdf'):
          loader=S3DirectoryLoader(bucket_name, prefix=directory,aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
          pages = loader.load_and_split()
          return pages
        else:
          return None
    
    except Exception as e:
        logging.error(f"getting error while reading content from s3 files:{e}")
        raise Exception(e)


def get_documents_from_s3(s3_url, aws_access_key_id, aws_secret_access_key):
    try:
      parsed_url = urlparse(s3_url)
      bucket = parsed_url.netloc
      file_key = parsed_url.path.lstrip('/')
      file_name=file_key.split('/')[-1]
      s3=boto3.client('s3',aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
      response=s3.head_object(Bucket=bucket,Key=file_key)
      file_size=response['ContentLength']
      
      logging.info(f'bucket : {bucket},file_name:{file_name},  file key : {file_key},  file size : {file_size}')
      pages=get_s3_pdf_content(s3_url,aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
      return file_name,pages
    except Exception as e:
      error_message = str(e)
      logging.exception(f'Exception in reading content from S3:{error_message}')
      raise Exception(error_message)    